{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Using Boruta on the Madalon Data Set\n",
    "Author: [Mike Bernico](mike.bernico@gmail.com)\n",
    "\n",
    "This example demonstrates using Boruta to find all relevant features in the Madalon dataset, which is an artificial dataset used in NIPS2003 and cited in the [Boruta paper](https://www.jstatsoft.org/article/view/v036i11/v36i11.pdf)\n",
    "\n",
    "This dataset has 2000 observations and 500 features.  We will use Boruta to identify the features that are relevant to the classification task.\n",
    "\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Installation\n",
    "#!pip install boruta"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn.datasets import load_iris\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from boruta import BorutaPy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def load_data():\n",
    "    # URLS for dataset via UCI\n",
    "    train_data_url='https://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/madelon_train.data'\n",
    "    train_label_url='https://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/madelon_train.labels'\n",
    "\n",
    "    X_data = pd.read_csv(train_data_url, sep=\" \", header=None)\n",
    "    y_data = pd.read_csv(train_label_url, sep=\" \", header=None)\n",
    "    data = X_data.loc[:, :499]\n",
    "    data['target'] = y_data[0]\n",
    "    return data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = load_data()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>...</th>\n",
       "      <th>491</th>\n",
       "      <th>492</th>\n",
       "      <th>493</th>\n",
       "      <th>494</th>\n",
       "      <th>495</th>\n",
       "      <th>496</th>\n",
       "      <th>497</th>\n",
       "      <th>498</th>\n",
       "      <th>499</th>\n",
       "      <th>target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>485</td>\n",
       "      <td>477</td>\n",
       "      <td>537</td>\n",
       "      <td>479</td>\n",
       "      <td>452</td>\n",
       "      <td>471</td>\n",
       "      <td>491</td>\n",
       "      <td>476</td>\n",
       "      <td>475</td>\n",
       "      <td>473</td>\n",
       "      <td>...</td>\n",
       "      <td>481</td>\n",
       "      <td>477</td>\n",
       "      <td>485</td>\n",
       "      <td>511</td>\n",
       "      <td>485</td>\n",
       "      <td>481</td>\n",
       "      <td>479</td>\n",
       "      <td>475</td>\n",
       "      <td>496</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>483</td>\n",
       "      <td>458</td>\n",
       "      <td>460</td>\n",
       "      <td>487</td>\n",
       "      <td>587</td>\n",
       "      <td>475</td>\n",
       "      <td>526</td>\n",
       "      <td>479</td>\n",
       "      <td>485</td>\n",
       "      <td>469</td>\n",
       "      <td>...</td>\n",
       "      <td>478</td>\n",
       "      <td>487</td>\n",
       "      <td>338</td>\n",
       "      <td>513</td>\n",
       "      <td>486</td>\n",
       "      <td>483</td>\n",
       "      <td>492</td>\n",
       "      <td>510</td>\n",
       "      <td>517</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>487</td>\n",
       "      <td>542</td>\n",
       "      <td>499</td>\n",
       "      <td>468</td>\n",
       "      <td>448</td>\n",
       "      <td>471</td>\n",
       "      <td>442</td>\n",
       "      <td>478</td>\n",
       "      <td>480</td>\n",
       "      <td>477</td>\n",
       "      <td>...</td>\n",
       "      <td>481</td>\n",
       "      <td>492</td>\n",
       "      <td>650</td>\n",
       "      <td>506</td>\n",
       "      <td>501</td>\n",
       "      <td>480</td>\n",
       "      <td>489</td>\n",
       "      <td>499</td>\n",
       "      <td>498</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>480</td>\n",
       "      <td>491</td>\n",
       "      <td>510</td>\n",
       "      <td>485</td>\n",
       "      <td>495</td>\n",
       "      <td>472</td>\n",
       "      <td>417</td>\n",
       "      <td>474</td>\n",
       "      <td>502</td>\n",
       "      <td>476</td>\n",
       "      <td>...</td>\n",
       "      <td>480</td>\n",
       "      <td>474</td>\n",
       "      <td>572</td>\n",
       "      <td>454</td>\n",
       "      <td>469</td>\n",
       "      <td>475</td>\n",
       "      <td>482</td>\n",
       "      <td>494</td>\n",
       "      <td>461</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>484</td>\n",
       "      <td>502</td>\n",
       "      <td>528</td>\n",
       "      <td>489</td>\n",
       "      <td>466</td>\n",
       "      <td>481</td>\n",
       "      <td>402</td>\n",
       "      <td>478</td>\n",
       "      <td>487</td>\n",
       "      <td>468</td>\n",
       "      <td>...</td>\n",
       "      <td>479</td>\n",
       "      <td>452</td>\n",
       "      <td>435</td>\n",
       "      <td>486</td>\n",
       "      <td>508</td>\n",
       "      <td>481</td>\n",
       "      <td>504</td>\n",
       "      <td>495</td>\n",
       "      <td>511</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 501 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     0    1    2    3    4    5    6    7    8    9   ...    491  492  493  \\\n",
       "0  485  477  537  479  452  471  491  476  475  473   ...    481  477  485   \n",
       "1  483  458  460  487  587  475  526  479  485  469   ...    478  487  338   \n",
       "2  487  542  499  468  448  471  442  478  480  477   ...    481  492  650   \n",
       "3  480  491  510  485  495  472  417  474  502  476   ...    480  474  572   \n",
       "4  484  502  528  489  466  481  402  478  487  468   ...    479  452  435   \n",
       "\n",
       "   494  495  496  497  498  499  target  \n",
       "0  511  485  481  479  475  496      -1  \n",
       "1  513  486  483  492  510  517      -1  \n",
       "2  506  501  480  489  499  498      -1  \n",
       "3  454  469  475  482  494  461       1  \n",
       "4  486  508  481  504  495  511       1  \n",
       "\n",
       "[5 rows x 501 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "y = data.pop('target')\n",
    "X = data.copy().values"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Boruta conforms to the sklearn api and can be used in a Pipeline as well as on it's own. Here we will demonstrate stand alone operation.\n",
    "\n",
    "First we will instantiate an estimator that Boruta will use.  Then we will instantiate a Boruta Object."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "rf = RandomForestClassifier(n_jobs=-1, class_weight=None, max_depth=7, random_state=0)\n",
    "# Define Boruta feature selection method\n",
    "feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Once built, we can use this object to identify the relevant features in our dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "feat_selector.fit(X, y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Boruta has confirmed only a few features as useful.   When our run ended, Boruta was undecided on 2 features.   '\n",
    "\n",
    "We can interrogate .support_ to understand which features were selected.   .support_ returns an array of booleans that we can use to slice our feature matrix to include only relevant columns.   Of course, .transform can also be used, as expected in the scikit API."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Check selected features\n",
    "print(feat_selector.support_)\n",
    "# Select the chosen features from our dataframe.\n",
    "selected = X[:, feat_selector.support_]\n",
    "print (\"\")\n",
    "print (\"Selected Feature Matrix Shape\")\n",
    "print (selected.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can also interrogate the ranking of the unselected features with .ranking_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "feat_selector.ranking_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
